# global options
#knitr::opts_chunk$set(echo=FALSE)
# DT defaults used by every DT::datatable() call below:
# horizontal scroll, paging, sticky header, highlight search matches
options(DT.options = list(scrollX = TRUE, paging=TRUE, fixedHeader=TRUE, searchHighlight = TRUE))
#load packages
#default
library(DataExplorer);library(data.table);library(dlookr)
library(extrafont);library(formattable);library(GGally);library(here)
library(janitor);library(lubridate);library(naniar)
library(patchwork);library(PerformanceAnalytics)
library(plotly);library(RColorBrewer);library(readxl)
library(skimr);library(tidyverse);library(scales)
#rmarkdown
library(kableExtra)
#ml
library(caret);library(tidymodels);library(h2o);library(glmnet)

Get & Split the Data

In one piped statement:

  1. read in data
  2. convert char to factor vars
  3. rename all colnames lowercase
  4. order cols by name: alphabetically
  5. order cols by datatype: nominal, then numeric
#read, clean, and order the data
#set a seed so the random train/test split is reproducible across knits
set.seed(88)

a = read_csv('train.csv') %>%
  #convert character vars to factors
  mutate(across(where(is.character), as.factor)) %>%
  #snake_case all column names
  clean_names() %>%
  #order cols alphabetically, then nominal before numeric
  select(sort(tidyselect::peek_vars())) %>%
  select(where(is.factor), where(is.numeric))

#stratify on the response so both partitions cover its range
split = a %>% initial_split(strata = sale_price)
train = rsample::training(split)
test = rsample::testing(split)

EDA: nom vars

check head rows

#first rows of the nominal vars as an interactive table
DT::datatable(head(select(train, where(is.factor))))

glimpse structure

#compact structure view of the nominal vars
glimpse(select(train, where(is.factor)))
## Rows: 1,097
## Columns: 43
## $ alley          <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ bldg_type      <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2fmCon, 1F...
## $ bsmt_cond      <fct> TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, ...
## $ bsmt_exposure  <fct> No, Mn, No, No, Av, Mn, No, No, No, No, Av, No, No, ...
## $ bsmt_fin_type1 <fct> GLQ, GLQ, ALQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, GLQ, Un...
## $ bsmt_fin_type2 <fct> Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, Unf, Un...
## $ bsmt_qual      <fct> Gd, Gd, TA, Gd, Ex, Gd, TA, TA, TA, Ex, Gd, TA, TA, ...
## $ central_air    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y...
## $ condition1     <fct> Norm, Norm, Norm, Norm, Norm, PosN, Artery, Artery, ...
## $ condition2     <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Artery, No...
## $ electrical     <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, FuseF, SBr...
## $ exter_cond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, ...
## $ exter_qual     <fct> Gd, Gd, TA, TA, Gd, TA, TA, TA, TA, Ex, Gd, TA, TA, ...
## $ exterior1st    <fct> VinylSd, VinylSd, Wd Sdng, VinylSd, VinylSd, HdBoard...
## $ exterior2nd    <fct> VinylSd, VinylSd, Wd Shng, VinylSd, VinylSd, HdBoard...
## $ fence          <fct> NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, GdWo,...
## $ fireplace_qu   <fct> NA, TA, Gd, NA, Gd, TA, TA, TA, NA, Gd, Gd, Fa, NA, ...
## $ foundation     <fct> PConc, PConc, BrkTil, Wood, PConc, CBlock, BrkTil, B...
## $ functional     <fct> Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, Typ, T...
## $ garage_cond    <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, ...
## $ garage_finish  <fct> RFn, RFn, Unf, Unf, RFn, RFn, Unf, RFn, Unf, Fin, RF...
## $ garage_qual    <fct> TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, TA, ...
## $ garage_type    <fct> Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, Detc...
## $ heating        <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA...
## $ heating_qc     <fct> Ex, Ex, Gd, Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, TA, Ex, ...
## $ house_style    <fct> 2Story, 2Story, 2Story, 1.5Fin, 1Story, 2Story, 1.5F...
## $ kitchen_qual   <fct> Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, Gd, TA, TA, ...
## $ land_contour   <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lv...
## $ land_slope     <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gt...
## $ lot_config     <fct> Inside, Inside, Corner, Inside, Inside, Corner, Insi...
## $ lot_shape      <fct> Reg, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, IR1, IR...
## $ mas_vnr_type   <fct> BrkFace, BrkFace, None, None, Stone, Stone, None, No...
## $ misc_feature   <fct> NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA, NA, ...
## $ ms_zoning      <fct> RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, RM, ...
## $ neighborhood   <fct> CollgCr, CollgCr, Crawfor, Mitchel, Somerst, NWAmes,...
## $ paved_drive    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y...
## $ pool_qc        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ roof_matl      <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg...
## $ roof_style     <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gab...
## $ sale_condition <fct> Normal, Normal, Abnorml, Normal, Normal, Normal, Abn...
## $ sale_type      <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, New, New, WD, WD...
## $ street         <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave...
## $ utilities      <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllP...

check for missing values

#tabulate missingness for the nominal vars (scrollable table)
train %>%
  select(where(is.factor)) %>%
  miss_var_summary() %>%
  DT::datatable(options = list(scrollY = '500px'))

notes about nom vars with missing data

  1. PoolQC: Pool quality
  2. MiscFeature: Miscellaneous feature not covered in other categories (e.g. elevator, 2nd garage, tennis court)
  3. Alley: Type of alley access to property
  4. Fence: Fence quality
  5. FireplaceQu: Fireplace quality

distribution: counts of unique levels

#count distinct levels per nominal var, sorted descending
train %>%
  select(where(is.factor)) %>%
  sapply(n_unique) %>%
  as.data.frame() %>%
  arrange(desc(.)) %>%
  DT::datatable()

reference: names of unique levels

#list the unique levels of each nominal var
#lapply is type-stable (always a list); sapply may silently simplify
lapply(select(train, where(is.factor)), unique)
## $alley
## [1] <NA> Pave Grvl
## Levels: Grvl Pave
## 
## $bldg_type
## [1] 1Fam   2fmCon Duplex TwnhsE Twnhs 
## Levels: 1Fam 2fmCon Duplex Twnhs TwnhsE
## 
## $bsmt_cond
## [1] TA   Gd   <NA> Fa   Po  
## Levels: Fa Gd Po TA
## 
## $bsmt_exposure
## [1] No   Mn   Av   <NA> Gd  
## Levels: Av Gd Mn No
## 
## $bsmt_fin_type1
## [1] GLQ  ALQ  Unf  Rec  BLQ  <NA> LwQ 
## Levels: ALQ BLQ GLQ LwQ Rec Unf
## 
## $bsmt_fin_type2
## [1] Unf  BLQ  <NA> ALQ  Rec  LwQ  GLQ 
## Levels: ALQ BLQ GLQ LwQ Rec Unf
## 
## $bsmt_qual
## [1] Gd   TA   Ex   <NA> Fa  
## Levels: Ex Fa Gd TA
## 
## $central_air
## [1] Y N
## Levels: N Y
## 
## $condition1
## [1] Norm   PosN   Artery Feedr  RRNn   RRAe   RRAn   RRNe   PosA  
## Levels: Artery Feedr Norm PosA PosN RRAe RRAn RRNe RRNn
## 
## $condition2
## [1] Norm   Artery RRNn   Feedr  PosA   PosN   RRAn   RRAe  
## Levels: Artery Feedr Norm PosA PosN RRAe RRAn RRNn
## 
## $electrical
## [1] SBrkr FuseF FuseA FuseP Mix  
## Levels: FuseA FuseF FuseP Mix SBrkr
## 
## $exter_cond
## [1] TA Gd Fa Ex
## Levels: Ex Fa Gd Po TA
## 
## $exter_qual
## [1] Gd TA Ex Fa
## Levels: Ex Fa Gd TA
## 
## $exterior1st
##  [1] VinylSd Wd Sdng HdBoard BrkFace MetalSd WdShing CemntBd Plywood AsbShng
## [10] Stucco  BrkComm AsphShn Stone   ImStucc CBlock 
## 15 Levels: AsbShng AsphShn BrkComm BrkFace CBlock CemntBd HdBoard ... WdShing
## 
## $exterior2nd
##  [1] VinylSd Wd Shng HdBoard MetalSd Wd Sdng Plywood CmentBd BrkFace Stucco 
## [10] AsbShng ImStucc AsphShn Brk Cmn Stone   Other   CBlock 
## 16 Levels: AsbShng AsphShn Brk Cmn BrkFace CBlock CmentBd HdBoard ... Wd Shng
## 
## $fence
## [1] <NA>  MnPrv GdWo  GdPrv MnWw 
## Levels: GdPrv GdWo MnPrv MnWw
## 
## $fireplace_qu
## [1] <NA> TA   Gd   Fa   Po   Ex  
## Levels: Ex Fa Gd Po TA
## 
## $foundation
## [1] PConc  BrkTil Wood   CBlock Slab   Stone 
## Levels: BrkTil CBlock PConc Slab Stone Wood
## 
## $functional
## [1] Typ  Min1 Maj1 Min2 Mod  Maj2
## Levels: Maj1 Maj2 Min1 Min2 Mod Sev Typ
## 
## $garage_cond
## [1] TA   Fa   <NA> Gd   Po   Ex  
## Levels: Ex Fa Gd Po TA
## 
## $garage_finish
## [1] RFn  Unf  Fin  <NA>
## Levels: Fin RFn Unf
## 
## $garage_qual
## [1] TA   Fa   Gd   <NA> Ex   Po  
## Levels: Ex Fa Gd Po TA
## 
## $garage_type
## [1] Attchd  Detchd  BuiltIn CarPort <NA>    Basment 2Types 
## Levels: 2Types Attchd Basment BuiltIn CarPort Detchd
## 
## $heating
## [1] GasA  GasW  Grav  Wall  OthW  Floor
## Levels: Floor GasA GasW Grav OthW Wall
## 
## $heating_qc
## [1] Ex Gd TA Fa
## Levels: Ex Fa Gd Po TA
## 
## $house_style
## [1] 2Story 1.5Fin 1Story 1.5Unf SFoyer 2.5Unf SLvl   2.5Fin
## Levels: 1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer SLvl
## 
## $kitchen_qual
## [1] Gd TA Ex Fa
## Levels: Ex Fa Gd TA
## 
## $land_contour
## [1] Lvl Bnk Low HLS
## Levels: Bnk HLS Low Lvl
## 
## $land_slope
## [1] Gtl Mod Sev
## Levels: Gtl Mod Sev
## 
## $lot_config
## [1] Inside  Corner  CulDSac FR2     FR3    
## Levels: Corner CulDSac FR2 FR3 Inside
## 
## $lot_shape
## [1] Reg IR1 IR2 IR3
## Levels: IR1 IR2 IR3 Reg
## 
## $mas_vnr_type
## [1] BrkFace None    Stone   BrkCmn  <NA>   
## Levels: BrkCmn BrkFace None Stone
## 
## $misc_feature
## [1] <NA> Shed Gar2 Othr
## Levels: Gar2 Othr Shed TenC
## 
## $ms_zoning
## [1] RL      RM      C (all) FV      RH     
## Levels: C (all) FV RH RL RM
## 
## $neighborhood
##  [1] CollgCr Crawfor Mitchel Somerst NWAmes  OldTown BrkSide Sawyer  NridgHt
## [10] NAmes   MeadowV IDOTRR  Edwards Timber  SawyerW Veenker ClearCr Gilbert
## [19] NoRidge NPkVill StoneBr Blmngtn BrDale  SWISU   Blueste
## 25 Levels: Blmngtn Blueste BrDale BrkSide ClearCr CollgCr Crawfor ... Veenker
## 
## $paved_drive
## [1] Y N P
## Levels: N P Y
## 
## $pool_qc
## [1] <NA> Ex   Fa   Gd  
## Levels: Ex Fa Gd
## 
## $roof_matl
## [1] CompShg WdShngl Metal   WdShake Membran Tar&Grv Roll   
## Levels: ClyTile CompShg Membran Metal Roll Tar&Grv WdShake WdShngl
## 
## $roof_style
## [1] Gable   Hip     Gambrel Mansard Flat    Shed   
## Levels: Flat Gable Gambrel Hip Mansard Shed
## 
## $sale_condition
## [1] Normal  Abnorml Partial AdjLand Family  Alloca 
## Levels: Abnorml AdjLand Alloca Family Normal Partial
## 
## $sale_type
## [1] WD    New   COD   ConLD ConLI Con   ConLw CWD   Oth  
## Levels: COD Con ConLD ConLI ConLw CWD New Oth WD
## 
## $street
## [1] Pave Grvl
## Levels: Grvl Pave
## 
## $utilities
## [1] AllPub NoSeWa
## Levels: AllPub NoSeWa

distribution: viz

#bar charts of level counts for each nominal var
train %>% select(where(is.factor)) %>% DataExplorer::plot_bar()

EDA: num vars

check head rows

#first rows of the numeric vars as an interactive table
DT::datatable(head(select(train, where(is.numeric))))

glimpse structure

#compact structure view of the numeric vars
glimpse(select(train, where(is.numeric)))
## Rows: 1,097
## Columns: 37
## $ bedroom_abv_gr  <dbl> 3, 3, 3, 1, 3, 3, 2, 2, 3, 4, 3, 2, 2, 2, 3, 3, 3, ...
## $ bsmt_fin_sf1    <dbl> 706, 486, 216, 732, 1369, 859, 0, 851, 906, 998, 0,...
## $ bsmt_fin_sf2    <dbl> 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bsmt_full_bath  <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, ...
## $ bsmt_half_bath  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ bsmt_unf_sf     <dbl> 150, 434, 540, 64, 317, 216, 952, 140, 134, 177, 14...
## $ enclosed_porch  <dbl> 0, 0, 272, 0, 0, 228, 205, 0, 0, 0, 0, 176, 0, 0, 0...
## $ fireplaces      <dbl> 0, 1, 1, 0, 1, 2, 2, 2, 0, 2, 1, 1, 0, 0, 0, 1, 1, ...
## $ full_bath       <dbl> 2, 2, 1, 1, 2, 2, 2, 1, 1, 3, 2, 1, 1, 2, 1, 2, 1, ...
## $ garage_area     <dbl> 548, 608, 642, 480, 636, 484, 468, 205, 384, 736, 8...
## $ garage_cars     <dbl> 2, 2, 3, 2, 2, 2, 2, 1, 1, 3, 3, 1, 2, 2, 1, 2, 2, ...
## $ garage_yr_blt   <dbl> 2003, 2001, 1998, 1993, 2004, 1973, 1931, 1939, 196...
## $ gr_liv_area     <dbl> 1710, 1786, 1717, 1362, 1694, 2090, 1774, 1077, 104...
## $ half_bath       <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ kitchen_abv_gr  <dbl> 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, ...
## $ lot_area        <dbl> 8450, 11250, 9550, 14115, 10084, 10382, 6120, 7420,...
## $ lot_frontage    <dbl> 65, 68, 60, 85, 75, NA, 51, 50, 70, 85, 91, NA, 51,...
## $ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mas_vnr_area    <dbl> 196, 162, 0, 0, 186, 240, 0, 0, 0, 286, 306, 212, 0...
## $ misc_val        <dbl> 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 500, 0, ...
## $ mo_sold         <dbl> 2, 9, 2, 10, 8, 11, 4, 1, 2, 7, 8, 5, 7, 10, 5, 9, ...
## $ ms_sub_class    <dbl> 60, 60, 70, 50, 20, 60, 50, 190, 20, 60, 20, 20, 45...
## $ open_porch_sf   <dbl> 61, 42, 35, 30, 57, 204, 0, 4, 0, 21, 33, 213, 112,...
## $ overall_cond    <dbl> 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 5, 8, 5, 6, 5, 7, ...
## $ overall_qual    <dbl> 7, 7, 7, 5, 8, 7, 7, 5, 5, 9, 7, 6, 7, 4, 5, 8, 5, ...
## $ pool_area       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ sale_price      <dbl> 208500, 223500, 140000, 143000, 307000, 200000, 129...
## $ screen_porch    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tot_rms_abv_grd <dbl> 8, 6, 7, 5, 7, 7, 8, 5, 5, 11, 7, 5, 5, 6, 6, 7, 6,...
## $ total_bsmt_sf   <dbl> 856, 920, 756, 796, 1686, 1107, 952, 991, 1040, 117...
## $ wood_deck_sf    <dbl> 0, 0, 0, 40, 255, 235, 90, 0, 0, 147, 160, 0, 48, 0...
## $ x1st_flr_sf     <dbl> 856, 920, 961, 796, 1694, 1107, 1022, 1077, 1040, 1...
## $ x2nd_flr_sf     <dbl> 854, 866, 756, 566, 0, 983, 752, 0, 0, 1142, 0, 0, ...
## $ x3ssn_porch     <dbl> 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ year_built      <dbl> 2003, 2001, 1915, 1993, 2004, 1973, 1931, 1939, 196...
## $ year_remod_add  <dbl> 2003, 2002, 1970, 1995, 2005, 1973, 1950, 1950, 196...
## $ yr_sold         <dbl> 2008, 2008, 2006, 2009, 2007, 2009, 2008, 2008, 200...

check for missing values

#tabulate missingness for the numeric vars (scrollable table)
train %>%
  select(where(is.numeric)) %>%
  miss_var_summary() %>%
  DT::datatable(options = list(scrollY = '500px'))

notes about num vars with missing data

  1. LotFrontage: Linear feet of street connected to property
  2. GarageYrBlt: Year garage was built
  3. MasVnrArea: Masonry veneer area in square feet

distribution: viz

#boxplots of each numeric var against the response
train %>% select(where(is.numeric)) %>% DataExplorer::plot_boxplot(by = 'sale_price')
## Warning: Removed 55 rows containing non-finite values (stat_boxplot).

## Warning: Removed 197 rows containing non-finite values (stat_boxplot).

distribution: viz

#histograms of all numeric vars
train %>% select(where(is.numeric)) %>% DataExplorer::plot_histogram()

distribution: viz

#density plots of all numeric vars
train %>% select(where(is.numeric)) %>% DataExplorer::plot_density()

pairwise correlations: viz

#correlation heatmap of the numeric vars (red = negative, green = positive)
train %>%
  select(where(is.numeric)) %>%
  GGally::ggcorr(low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)

EDA: other

check distribution of response var in more detail: right skewed

#frequency polygon of the response
ggplot(train, aes(sale_price)) + geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#histogram of the response
ggplot(train, aes(sale_price)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#five-number summary (plus mean) of the response
summary(train[['sale_price']])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35311  130000  163000  181841  213500  755000

check for normality in distribution of dependent var

#normal Q-Q plot of the raw response with a reference line
with(train, {
  qqnorm(sale_price)
  qqline(sale_price, lwd = 2, col = 'darkred')
})

log10 transformation helps restore normality

#normal Q-Q plot of the log10-transformed response with a reference line
with(train, {
  qqnorm(log10(sale_price))
  qqline(log10(sale_price), lwd = 2, col = 'darkred')
})

Preprocessing

Reference: [package recipes](https://recipes.tidymodels.org/reference/index.html)

create preprocessing recipe

#preprocessing recipe: sale_price is the outcome, every other column a predictor
#step order matters: impute/convert first, filter next, normalize last
(recipe = train %>% recipe(sale_price ~ . ) %>%
    #data conversion: treat calendar-like numerics as nominal
    step_mutate(mo_sold = factor(mo_sold)) %>%
    step_mutate(year_built = factor(year_built)) %>%
    step_mutate(year_remod_add = factor(year_remod_add)) %>%
    step_mutate(yr_sold = factor(yr_sold)) %>%
    step_mutate(garage_yr_blt = factor(garage_yr_blt)) %>% # * has missing data in test set   
   #data imputation: numeric
   #NOTE(review): step_medianimpute() is deprecated in newer recipes releases
   #(renamed step_impute_median()) — confirm the installed package version
    step_medianimpute(lot_frontage, mas_vnr_area) %>%
    #map NA factor levels to an explicit 'unknown' level
    step_unknown(all_nominal(),-all_outcomes()) %>%   
    #handling potential multicollinearity via filtering
    step_corr(all_numeric(),-all_outcomes()) %>% 
    step_nzv(all_numeric(),-all_outcomes()) %>% 
    step_zv(all_numeric(),-all_outcomes()) %>% 
    #center and scale numeric data
    step_normalize(all_numeric(),-all_outcomes()) %>% 
   ##!!<NOTE> for these 2 vars, test ds has fewer levels than the train ds, so omit!
   step_naomit(year_built, garage_yr_blt)
    #dummy variable creation should occur AFTER normalization of numeric vars
   #step_dummy(all_nominal(),-all_outcomes(), one_hot = TRUE)
     ##!!<NOTE> this is annoying, but if I don't use it I keep getting missing vals in test set
   #step_pca(all_predictors(),-all_outcomes())
)
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         79
## 
## Operations:
## 
## Variable mutation for mo_sold
## Variable mutation for year_built
## Variable mutation for year_remod_add
## Variable mutation for yr_sold
## Variable mutation for garage_yr_blt
## Median Imputation for lot_frontage, mas_vnr_area
## Unknown factor level assignment for all_nominal(), -all_outcomes()
## Correlation filter on all_numeric(), -all_outcomes()
## Sparse, unbalanced variable filter on all_numeric(), -all_outcomes()
## Zero variance filter on all_numeric(), -all_outcomes()
## Centering and scaling for all_numeric(), -all_outcomes()
## Removing rows with NA values in year_built, garage_yr_blt

examine interaction of recipe with train ds

#estimate the recipe on the train ds and print the trained steps
print(prep(recipe))
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         79
## 
## Training data contained 1097 data points and 1097 incomplete rows. 
## 
## Operations:
## 
## Variable mutation for mo_sold [trained]
## Variable mutation for year_built [trained]
## Variable mutation for year_remod_add [trained]
## Variable mutation for yr_sold [trained]
## Variable mutation for garage_yr_blt [trained]
## Median Imputation for lot_frontage, mas_vnr_area [trained]
## Unknown factor level assignment for alley, bldg_type, bsmt_cond, ... [trained]
## Correlation filter removed no terms [trained]
## Sparse, unbalanced variable filter removed bsmt_fin_sf2, ... [trained]
## Zero variance filter removed no terms [trained]
## Centering and scaling for bedroom_abv_gr, bsmt_fin_sf1, ... [trained]
## Removing rows with NA values in year_built, garage_yr_blt

preprocess train ds & test ds

#'Prep the recipe ONCE, then juice the train ds and bake the test ds'
#(the original prepped twice, re-estimating every step redundantly)
prepped = recipe %>% prep(retain = TRUE)

juiced.train = prepped %>% juice() %>%
  select(sort(tidyselect::peek_vars())) %>% select(where(is.factor), where(is.numeric))

baked.test = prepped %>% bake(test) %>%
  select(sort(tidyselect::peek_vars())) %>% select(where(is.factor), where(is.numeric))

juiced.train %>% head() %>% DT::datatable()
baked.test %>% head() %>% DT::datatable()

sanity check for missing data

#confirm no missing data survived preprocessing
juiced.train %>% miss_var_summary()
baked.test %>% miss_var_summary()

sanity check: nom vars - distribution: viz

#bar charts of the preprocessed nominal vars
DataExplorer::plot_bar(select(juiced.train, where(is.factor)))
## 3 columns ignored with more than 50 categories.
## garage_yr_blt: 97 categories
## year_built: 110 categories
## year_remod_add: 61 categories

sanity check: num vars - distribution: viz

#histograms of the preprocessed numeric vars
DataExplorer::plot_histogram(select(juiced.train, where(is.numeric)))

distribution: viz

#boxplots of the preprocessed numeric vars against the response
DataExplorer::plot_boxplot(select(juiced.train, where(is.numeric)), by = 'sale_price')

sanity check: num vars - pairwise correlations: viz

#correlation heatmap of the preprocessed numeric vars
juiced.train %>%
  select(where(is.numeric)) %>%
  GGally::ggcorr(low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)

Simple Modeling

Random Forest: Modeling and Assessment

#!!<NOTE> to analyze var importance, you need an importance arg
# The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance. 
# seed makes the forest reproducible (and is consistent with predict(..., seed = 88) below)
model.rf = ranger::ranger(log10(sale_price) ~ . , data = juiced.train, num.trees = 300, importance = 'impurity_corrected', seed = 88)

model.rf
## Ranger result
## 
## Call:
##  ranger::ranger(log10(sale_price) ~ ., data = juiced.train, num.trees = 300,      importance = "impurity_corrected") 
## 
## Type:                             Regression 
## Number of trees:                  300 
## Sample size:                      1097 
## Number of independent variables:  71 
## Mtry:                             8 
## Target node size:                 5 
## Variable importance mode:         impurity_corrected 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.004251166 
## R squared (OOB):                  0.8592599
#extract variable importance as a tidy (var, imp) data frame, largest first
model.rf.var.imp = model.rf$variable.importance %>%
  as.matrix() %>%
  as.data.frame.matrix() %>%
  rownames_to_column() %>%
  rename(var = rowname, imp = V1) %>%
  arrange(desc(imp))

#horizontal bar chart of importance, ordered by magnitude
model.rf.var.imp %>%
  ggplot(aes(fct_reorder(var, imp), imp)) +
  geom_col() +
  coord_flip()

#make predictions on the held-out test set
#(ranger warns because the forest was grown with an importance setting;
# the predictions are still usable)
model.rf.preds = predict(model.rf, baked.test, seed = 88)$predictions %>% as.vector()

#assemble truth vs prediction; 10^x undoes the log10 transformation
model.rf.df = tibble(y = baked.test$sale_price, preds = 10 ^ model.rf.preds)

#truth vs prediction scatter with a linear fit overlay
model.rf.df %>%
  ggplot(aes(y, preds)) +
  geom_point() +
  geom_smooth(method = 'lm')

#what is rmse of model? ~31k
model.rf.df %>% yardstick::rmse(truth = y, estimate = preds)
#compare relative performance:
#rmse vs the sd of the response (~70-80k) — model error is well below a naive guess
sd(test$sale_price)

Elastic Net (LASSO and Ridge)

#build model matrices for glmnet
#data.matrix() converts factors to their integer level codes — an implicit
#ordinal encoding. NOTE(review): one-hot/dummy encoding may be more appropriate
#for unordered factors; confirm this encoding is intended.
x.train = juiced.train %>% select(-sale_price) %>% data.matrix()
#model on the log10 scale to tame the right skew of sale_price
y.train = log10(juiced.train$sale_price)

x.test = baked.test %>% select(-sale_price) %>% data.matrix()
y.test = log10(baked.test$sale_price)

#---------------------------------------------------(0)ridge

model.ridge = cv.glmnet(
  x = x.train, y = y.train,
  type.measure = 'mse',
  alpha = 0, #ridge regression
  family = 'gaussian'
  )

model.ridge$lambda.1se #value of lambda that resulted in simplest model
model.ridge$lambda.min #value of lambda that resulted in the smallest sum of squares

#BUG FIX: predictions are on the log10 scale; the inverse of log10(x) is 10^x,
#not x^10. Also spell out glmnet's `newx` arg (was partially matched as `new`).
model.ridge.preds = 10 ^ predict(model.ridge, s = model.ridge$lambda.min, newx = x.test)

#y, pred — both mapped back to the dollar scale with 10^y
model.ridge.df = tibble(y = 10 ^ y.test, preds = as.vector(model.ridge.preds))

#viz
model.ridge.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')

#BUG FIX: compute rmse with truth and preds on the SAME (dollar) scale;
#the original mixed log10-scale truth with mis-exponentiated preds,
#yielding a nonsensical ~$17.9M rmse
model.ridge.rmse = sqrt(mean((model.ridge.df$y - model.ridge.df$preds)^2))

paste0('ridge regression has a rmse of ', formattable::currency(model.ridge.rmse, digits = 0))
#---------------------------------------------------(1)lasso

model.lasso = cv.glmnet(
  x = x.train, y = y.train,
  type.measure = 'mse',
  alpha = 1, #lasso regression
  family = 'gaussian'
  )

model.lasso$lambda.1se #value of lambda that resulted in simplest model
model.lasso$lambda.min #value of lambda that resulted in the smallest sum of squares

#BUG FIX: the inverse of log10(x) is 10^x, not x^10; also use glmnet's `newx`
model.lasso.preds = 10 ^ predict(model.lasso, s = model.lasso$lambda.min, newx = x.test)

#y, pred — both mapped back to the dollar scale with 10^y
model.lasso.df = tibble(y = 10 ^ y.test, preds = as.vector(model.lasso.preds))

#viz
model.lasso.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')

#BUG FIX: rmse on the dollar scale (truth and preds must share a scale)
model.lasso.rmse = sqrt(mean((model.lasso.df$y - model.lasso.df$preds)^2))

paste0('lasso regression has a rmse of ', formattable::currency(model.lasso.rmse, digits = 0))
#---------------------------------------------------(0.5)elastic

model.elastic = cv.glmnet(
  x = x.train, y = y.train,
  type.measure = 'mse',
  alpha = 0.5, #elastic regression
  family = 'gaussian'
  )

model.elastic$lambda.1se #value of lambda that resulted in simplest model
model.elastic$lambda.min #value of lambda that resulted in the smallest sum of squares

#BUG FIX: the inverse of log10(x) is 10^x, not x^10; also use glmnet's `newx`
model.elastic.preds = 10 ^ predict(model.elastic, s = model.elastic$lambda.min, newx = x.test)

#y, pred — both mapped back to the dollar scale with 10^y
model.elastic.df = tibble(y = 10 ^ y.test, preds = as.vector(model.elastic.preds))

#viz
model.elastic.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')

#BUG FIX: rmse on the dollar scale (truth and preds must share a scale)
model.elastic.rmse = sqrt(mean((model.elastic.df$y - model.elastic.df$preds)^2))

paste0('elastic regression has a rmse of ', formattable::currency(model.elastic.rmse, digits = 0))

Simple Modeling Revisited: filtered to top N variables by importance

find top N most important vars

#keep the N most important vars
N = 50
(model.rf.var.imp.topN = model.rf.var.imp %>% slice_head(n = N))

Random Forest Modeling and Assessment: High Important Vars Only

#filter data to top `r  N ` important vars
#all_of() makes the external character vector explicit (bare vectors in
#select() are ambiguous and deprecated in newer dplyr)
juiced.train.filt.imp.vars = juiced.train %>% select(sale_price, all_of(model.rf.var.imp.topN$var))

#create rf model (seed for reproducibility, consistent with predict(..., seed = 88))
model.rf2 = ranger::ranger(log10(sale_price) ~ . , data = juiced.train.filt.imp.vars, num.trees = 300, importance = 'impurity_corrected', seed = 88)

#check out model
print(model.rf2)
## Ranger result
## 
## Call:
##  ranger::ranger(log10(sale_price) ~ ., data = juiced.train.filt.imp.vars,      num.trees = 300, importance = "impurity_corrected") 
## 
## Type:                             Regression 
## Number of trees:                  300 
## Sample size:                      1097 
## Number of independent variables:  50 
## Mtry:                             7 
## Target node size:                 5 
## Variable importance mode:         impurity_corrected 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.00398237 
## R squared (OOB):                  0.8681587
#extract the top-N model's variable importance as a tidy data frame
model.rf2.var.imp = model.rf2$variable.importance %>%
  as.matrix() %>%
  as.data.frame.matrix() %>%
  rownames_to_column() %>%
  rename(var = rowname, imp = V1) %>%
  arrange(desc(imp))

#horizontal bar chart of importance, ordered by magnitude
model.rf2.var.imp %>%
  ggplot(aes(fct_reorder(var, imp), imp)) +
  geom_col() +
  coord_flip()

#create preds (ranger warns because the forest was grown with an importance
#setting; the predictions are still usable)
model.rf2.preds = predict(model.rf2, baked.test, seed = 88)$predictions %>% as.vector()

#assemble truth vs prediction; 10^x undoes the log10 transformation
model.rf2.df = tibble(y = baked.test$sale_price, preds = 10 ^ model.rf2.preds)

#viz — BUG FIX: plot the top-N model's predictions (model.rf2.df);
#the original plotted model.rf.df, the full model's results, by copy-paste
model.rf2.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')

#rmse of the top-N model (~32k)
model.rf2.df %>% yardstick::rmse(truth = y, estimate = preds)

Commentary

  1. When using just the top 50 variables, overall performance doesn’t change much
  2. Simpler models are faster, and easier to understand, so it’s worth filtering down to just these variables in all modeling

Intermediate Modeling

Elastic Net (LASSO and Ridge)

#elastic net introduction: https://sciphy-stats.com/post/2019-01-25-finalizing-glmnet-models/
#troubleshooting NA RMSE: https://stackoverflow.com/questions/51548255/caret-there-were-missing-values-in-resampled-performance-measures

#10-fold cross validation
trControl = caret::trainControl(method = 'cv', number = 10)

#hyperparameter grid: sweep lambda, hold alpha at 0 (ridge)
tuneGrid = expand.grid(
  alpha = 0, #ridge
  lambda = seq(0.01, .2, .005)
)

#cross-validate ridge over the lambda grid, selecting by RMSE
model2.ridge = caret::train(
  x = x.train,
  y = y.train,
  metric = 'RMSE',
  method = 'glmnet',
  trControl = trControl,
  tuneGrid = tuneGrid
)

#pluck the best hyperparameter combination found by cross validation
bestTune = purrr::pluck(model2.ridge, 'bestTune')

#plot hyperparameter lambda against RMSE
plot(model2.ridge)

#finalize: refit glmnet directly with the best hyperparameters
model2.ridge = glmnet::glmnet(
  x = x.train,
  y = y.train,
  family = 'gaussian',
  alpha = bestTune$alpha,
  lambda = bestTune$lambda
)

#tabulate the fitted coefficients, largest first
broom::tidy(model2.ridge) %>% select(term, estimate) %>% 
  mutate_at('estimate', round, 2) %>% arrange(-estimate) %>% DT::datatable()

#BUG FIX: predictions are on the log10 scale; the inverse of log10(x) is
#10^x, not x^10. Name glmnet's `newx` arg explicitly.
model2.ridge.preds = 10 ^ predict(model2.ridge, newx = x.test)

#y, pred — both mapped back to the dollar scale with 10^y
model2.ridge.df = tibble(y = 10 ^ y.test, preds = as.vector(model2.ridge.preds))

#viz
model2.ridge.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')

#BUG FIX: rmse on the dollar scale (truth and preds must share a scale);
#the original's scale mix-up produced a nonsensical ~$18M rmse
model2.ridge.rmse = sqrt(mean((model2.ridge.df$y - model2.ridge.df$preds)^2))

paste0('ridge regression has a rmse of ', formattable::currency(model2.ridge.rmse, digits = 0))
#ridge coefficients as an importance proxy, largest first
model2.ridge.var.imp = coef(model2.ridge) %>%
  as.data.frame.matrix() %>%
  rename(coef = s0) %>%
  arrange(desc(coef))

model2.ridge.var.imp %>% DT::datatable()

AutoML

[h2o AutoML documentation](https://docs.h2o.ai/h2o-tutorials/latest-stable/h2o-world-2017/automl/index.html)

start h2o

## #preprocess entire dataset
## h2o.df = recipe %>% prep(retain=TRUE) %>% bake(a)
## 
## #select only the vars that were considered important
## h2o.df = h2o.df %>% select(sale_price, model.rf.var.imp.topN$var)
## #check for missing vars
## miss_var_summary(h2o.df)
## # Initialise h2o cluster
## h2o.init()
## # Convert data to h2o frame
## h2o.hf <- as.h2o(h2o.df)
## # describe dataset
## h2o.describe(h2o.hf)
## # identify target and features
## y <- 'sale_price'
## x <- setdiff(colnames(h2o.df), y)
## # split data into train & validation sets
## sframe <- h2o.splitFrame(h2o.hf, seed = 88)
## train <- sframe[[1]]
## test <- sframe[[2]]
## 
## aml <- h2o.automl(y = y,
##                   training_frame = train,
##                   leaderboard_frame = test,
##                   max_runtime_secs = 30,
##                   seed = 88
##                   #project_name = "XXX"
##                   )
## 
## #view leaderboard
## print(aml@leaderboard)
## 
## #make predictions
## aml.preds = predict(aml@leader, test)

automl leader model assessment

## print(h2o.performance(aml@leader, test)) #23967.5